library(tidyverse)
library(maps) # The maps package has geographic information on all U.S states
library(ggmap)
library(tmap)
library(rgdal)
library(rgeos)
library(geojsonio)
library(viridis)
library(hrbrthemes)
library(tm)
library(sp)
library(DT)
library(devtools)
library(leaflet)
library(ggthemes)
library(RColorBrewer)
# The file path below is relative to the working directory; check it with
# getwd() if the file cannot be found.
data <- read.csv("data/airbnb_listings/airbnb_listings.csv")

# Keep only the columns that the rest of the analysis uses.
airbnbnyc <- select(
  data,
  "id", "host_id", "host_listings_count",
  "latitude", "longitude",
  "room_type", "accommodates", "bathrooms", "bedrooms", "price",
  "neighbourhood_cleansed", "neighbourhood_group_cleansed",
  "availability_365", "number_of_reviews", "review_scores_rating",
  "transit"
)
head(airbnbnyc)
## id host_id host_listings_count latitude longitude room_type
## 1 2539 2787 6 40.64749 -73.97237 Private room
## 2 2595 2845 5 40.75362 -73.98377 Entire home/apt
## 3 3647 4632 1 40.80902 -73.94190 Private room
## 4 3831 4869 1 40.68514 -73.95976 Entire home/apt
## 5 4989 7118 1 40.76260 -73.99304 Entire home/apt
## 6 5022 7192 1 40.79851 -73.94399 Entire home/apt
## accommodates bathrooms bedrooms price neighbourhood_cleansed
## 1 2 1 1 $149.00 Kensington
## 2 2 1 0 $225.00 Midtown
## 3 2 1 1 $150.00 Harlem
## 4 3 1 1 $89.00 Clinton Hill
## 5 2 1 1 $105.00 Hell's Kitchen
## 6 1 1 NA $80.00 East Harlem
## neighbourhood_group_cleansed availability_365 number_of_reviews
## 1 Brooklyn 365 9
## 2 Manhattan 331 44
## 3 Manhattan 365 0
## 4 Brooklyn 182 258
## 5 Manhattan 83 27
## 6 Manhattan 0 9
## review_scores_rating
## 1 98
## 2 95
## 3 NA
## 4 90
## 5 89
## 6 93
## transit
## 1 Very close to F and G trains and Express bus into NY. The B and Q are closeby also.
## 2 Apartment is located on 37th Street between 5th & 6th Avenue, just a few blocks from all subway connections. Closest Subways (in order of proximity to apartment (Website hidden by Airbnb) W: 34th Street & 6th Avenu (Website hidden by Airbnb) 3: 34th Street & 7th Avenue 7: 42nd & 5th Avenu (Website hidden by Airbnb) S: 42nd Street between Park & Lexington Avenue (Website hidden by Airbnb) E: 34th Street and 8th Avenue If coming by car, there is a parking garage on the block and free street parking.
## 3
## 4 B52 bus for a 10-minute ride to downtown Brooklyn is a few yards away on the corner; G train/Classon Avenue is 5 blocks away; C train is about 6 blocks to either the Clinton/Washington stop or Franklin Avenue stop. There is on-street parking, alternate side is twice per week on the immediate block but only once per week on Classon. From LaGuardia Airport, a taxi will cost $30-$35, but there is also a bus that will put you at the Jackson Heights subway station, and from there it's about 5 stops to catch the G train, which stops 5 blocks away. From JFK, the taxi is closer to $40, but the AirTran can get you conveniently to the A/C line and the C train is about 6 blocks from here. From JFK via subway/metro/train: From JFK take the AirTrain to Howard Beach to catch the A train toward Brooklyn/Manhattan. Take the A train to Utica Avenue and go across that same platform to catch the C local train (you could also transfer at Nostrand but you would have to carry luggage downstairs to cat
## 5 NYC subways and the Hudson River are each just a 10 min walk away.
## 6
Read in the neighbourhoods.geojson file of NYC neighborhoods and the nybb shapefile of NYC boroughs.
# Load the neighbourhood boundaries from the GeoJSON file.
nyc_neighborhoods <- "data/neighbourhoods.geojson" %>%
  readOGR(dsn = .)
## OGR data source with driver: GeoJSON
## Source: "C:\Users\hs324\OneDrive\Desktop\Class_Files\04_2022Spring_CU\GR5063_DataViz\Assignment\hw02_AirbnbNYC\assignment-2-airbnb-HuanSunGo\data\neighbourhoods.geojson", layer: "neighbourhoods"
## with 233 features
## It has 2 fields
# Reproject to WGS84 longitude/latitude, then convert the spatial object into
# a plain data frame that ggplot2 can draw.
nyc_neighborhoods <- nyc_neighborhoods %>%
  spTransform(CRS("+proj=longlat +datum=WGS84")) %>%
  fortify()
## Regions defined for each Polygons
# Load the borough boundaries from the "nybb" shapefile layer.
nyc_boroughs <- readOGR(dsn = "data/nyc_boroughs_map/.", layer = "nybb")
## OGR data source with driver: ESRI Shapefile
## Source: "C:\Users\hs324\OneDrive\Desktop\Class_Files\04_2022Spring_CU\GR5063_DataViz\Assignment\hw02_AirbnbNYC\assignment-2-airbnb-HuanSunGo\data\nyc_boroughs_map", layer: "nybb"
## with 5 features
## It has 4 fields
# Reproject to WGS84 longitude/latitude, then convert to a data frame for
# plotting (the map below uses its long, lat, group and id columns).
nyc_boroughs <- nyc_boroughs %>%
  spTransform(CRS("+proj=longlat +datum=WGS84")) %>%
  fortify()
## Regions defined for each Polygons
# Download the city-wide background map (zoom level 11) with get_map.
# NOTE(review): this relies on the Stamen "toner-lite" tile service still
# being reachable -- confirm before re-running.
map_nyc <- get_map(
  location = "New York City",
  zoom = 11,
  source = "stamen",
  maptype = "toner-lite"
)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=New%20York%20City&zoom=11&size=640x640&scale=2&maptype=terrain&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=New+York+City&key=xxx
## Source : http://tile.stamen.com/toner-lite/11/601/768.png
## Source : http://tile.stamen.com/toner-lite/11/602/768.png
## Source : http://tile.stamen.com/toner-lite/11/603/768.png
## Source : http://tile.stamen.com/toner-lite/11/604/768.png
## Source : http://tile.stamen.com/toner-lite/11/601/769.png
## Source : http://tile.stamen.com/toner-lite/11/602/769.png
## Source : http://tile.stamen.com/toner-lite/11/603/769.png
## Source : http://tile.stamen.com/toner-lite/11/604/769.png
## Source : http://tile.stamen.com/toner-lite/11/601/770.png
## Source : http://tile.stamen.com/toner-lite/11/602/770.png
## Source : http://tile.stamen.com/toner-lite/11/603/770.png
## Source : http://tile.stamen.com/toner-lite/11/604/770.png
## Source : http://tile.stamen.com/toner-lite/11/601/771.png
## Source : http://tile.stamen.com/toner-lite/11/602/771.png
## Source : http://tile.stamen.com/toner-lite/11/603/771.png
## Source : http://tile.stamen.com/toner-lite/11/604/771.png
# Overview map: borough polygons (one fill colour per borough id) drawn over
# the basemap, with every listing added as a small translucent point.
# ggmap() returns a ggplot object, so ordinary ggplot2 layers can be added.
g <- ggmap(
  map_nyc,
  base_layer = ggplot(data = nyc_boroughs, mapping = aes(x = long, y = lat)),
  extent = "normal",
  maprange = FALSE
) +
  # alpha is a fixed setting here, so it is passed outside aes(); mapping it
  # inside aes() as well had no effect.
  geom_polygon(
    aes(x = long, y = lat, group = group, fill = id),
    size = 0.5, color = "#636363", alpha = 0.3
  ) +
  geom_point(
    data = airbnbnyc,
    aes(x = longitude, y = latitude),
    size = 0.3, alpha = 0.1, color = "#2c7fb8"
  ) +
  theme_map() +
  # The borough fill legend is hidden here rather than via an (unused)
  # legend argument to ggplot().
  theme(legend.position = "none")
g
According to the map, the high-density areas are mostly in Manhattan and Brooklyn; we'll take a closer look using a density estimate in the next question.
# Re-download the basemap one zoom level closer (zoom 12) to focus on the
# metropolitan core; this replaces the zoom-11 map stored in map_nyc.
# NOTE(review): this relies on the Stamen "toner-lite" tile service still
# being reachable -- confirm before re-running.
map_nyc <- get_map(
  location = "New York City",
  zoom = 12,
  source = "stamen",
  maptype = "toner-lite"
)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=New%20York%20City&zoom=12&size=640x640&scale=2&maptype=terrain&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=New+York+City&key=xxx
## Source : http://tile.stamen.com/toner-lite/12/1204/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1204/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1204/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1204/1541.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1541.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1541.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1541.png
# Highlight Airbnb hot spots: a 2-D kernel density estimate of the listing
# coordinates, drawn as filled contour polygons over the basemap. Both fill
# and transparency follow the computed density level.
density <- ggmap(map_nyc) +
  stat_density_2d(
    data = airbnbnyc, geom = "polygon",
    # after_stat(level) is the current spelling of the older ..level..
    aes(
      x = longitude, y = latitude,
      fill = after_stat(level), alpha = after_stat(level)
    )
  ) +
  scale_fill_distiller(palette = 4, direction = 1) +
  theme_map() +
  # Hand-placed labels for the three densest neighbourhoods. ggplot2:: is
  # spelled out because other attached packages also export annotate().
  ggplot2::annotate("text", x = -73.994508, y = 40.763186,
                    label = "Hells Kitchen",
                    color = "Dark Blue", fontface = 4, size = 3) +
  ggplot2::annotate("text", x = -73.98736, y = 40.72527,
                    label = "East Village",
                    color = "Dark Blue", fontface = 4, size = 3) +
  ggplot2::annotate("text", x = -73.951996, y = 40.71200,
                    label = "Williamsburg",
                    color = "Dark Blue", fontface = 4, size = 3) +
  # Must come after theme_map(): a complete theme resets earlier theme()
  # tweaks, which is why the former legend.position = "bottom" was dropped.
  theme(legend.position = "none")
density
## Warning: Removed 10503 rows containing non-finite values (stat_density2d).
It is not surprising that the area southwest of Central Park and the East Village have the most homes rented out on Airbnb, but it is interesting that Williamsburg, an up-and-coming neighborhood, has also attracted a lot of listings.
An Airbnb host can set up a calendar for their listing so that it is only available for a few days or weeks a year. Other listings are available all year round (except for when it is already booked). Entire homes or apartments highly available and rented frequently year-round to tourists probably don’t have the owner present, are illegal, and more importantly, are displacing New Yorkers.
Hint: The variable availability_365: What part of the year is the property available to be rented is a possible choice to categorize rentals.
# Classify each listing by the number of days per year it is available:
#   >= 270 days  -> "Permanent"
#   180-269 days -> "Semi-Permanent"
#   <  180 days  -> "Sporadically"
# Columns are referenced directly (not via airbnbnyc$...) so the rule always
# applies to the rows that are actually flowing through the pipe.
airbnbnyc_case <- airbnbnyc %>%
  mutate(case = case_when(
    availability_365 >= 270 ~ "Permanent",
    availability_365 >= 180 ~ "Semi-Permanent",
    availability_365 < 180 ~ "Sporadically"
  ))

# Count listings per neighbourhood and case, and attach each neighbourhood's
# overall total so the busiest neighbourhoods sort to the top.
permanent_count <- airbnbnyc_case %>%
  group_by(neighbourhood_cleansed, case) %>%
  # drop_last keeps the grouping by neighbourhood for the total below
  summarize(nbh_number = n(), .groups = "drop_last") %>%
  mutate(group_sum = sum(nbh_number)) %>%
  ungroup() %>%
  arrange(desc(group_sum)) %>%
  rename(
    neighborhood = neighbourhood_cleansed,
    neighbor_case_total = nbh_number,
    neighbor_total = group_sum
  )
## `summarise()` has grouped output by 'neighbourhood_cleansed'. You can override using the `.groups` argument.
# Preview the first rows of the summary.
permanent_count %>% head()
## # A tibble: 6 x 4
## # Groups: neighborhood [2]
## neighborhood case neighbor_case_total neighbor_total
## <chr> <chr> <int> <int>
## 1 Williamsburg Permanent 439 3944
## 2 Williamsburg Semi-Permanent 280 3944
## 3 Williamsburg Sporadically 3225 3944
## 4 Bedford-Stuyvesant Permanent 732 3708
## 5 Bedford-Stuyvesant Semi-Permanent 411 3708
## 6 Bedford-Stuyvesant Sporadically 2565 3708
# Interactive table of listing counts per neighbourhood and availability
# case, with a filter box above every column.
datatable(
  permanent_count,
  rownames = FALSE,
  colnames = c(
    "Neighborhood",
    "Case",
    "Case Number in Neighborhood",
    "Total Number in Neighborhood"
  ),
  caption = htmltools::tags$caption(
    "Listing Availability: Sporadically or Year-Round"
  ),
  # options=list(autoWidth = TRUE, dom = "ft", pageLength=10),
  filter = list(position = "top")
)
# Select the 10 neighbourhoods with the most listings. Picking them by name
# (instead of taking the first 30 rows) stays correct even when a
# neighbourhood does not have a row for each of the three cases.
top10_neighborhoods <- permanent_count %>%
  ungroup() %>%
  distinct(neighborhood, neighbor_total) %>%
  slice_max(neighbor_total, n = 10, with_ties = FALSE)

permanent_count_top10 <- permanent_count %>%
  semi_join(top10_neighborhoods, by = "neighborhood")

# Stacked bar chart for comparison. Each segment is the number of listings in
# one case (neighbor_case_total), so a full bar adds up to the neighbourhood
# total; bars are ordered by that total.
ggplot(
  data = permanent_count_top10,
  aes(
    x = reorder(neighborhood, neighbor_total),
    y = neighbor_case_total,
    fill = case
  )
) +
  geom_col() +
  coord_flip() +
  # Labels name the aesthetics, not the screen axes: x is the neighbourhood
  # even though coord_flip() draws it vertically.
  labs(
    x = "Neighborhood",
    y = "Number of Listings",
    title = "Listing Availability: Sporadically or Year-Round"
  ) +
  theme_tufte(base_size = 13) +
  # Applied after the complete theme so the centred title is not reset.
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_manual(values = c("#ece2f0", "#a6bddb", "#1c9099"))
# Density of listings, one panel per availability case. This reuses (and
# overwrites) the plot variable g.
g <- ggmap(map_nyc) +
  stat_density_2d(
    data = airbnbnyc_case, geom = "polygon",
    # after_stat(level) is the current spelling of the older ..level..
    aes(
      x = longitude, y = latitude,
      fill = after_stat(level), alpha = after_stat(level)
    )
  ) +
  scale_fill_distiller(palette = 4, direction = 1) +
  theme_map() +
  facet_wrap(~case) +
  # Placed after theme_map() so the legend position is not reset.
  theme(legend.position = "bottom")
g
## Warning: Removed 10503 rows containing non-finite values (stat_density2d).
# Interactive map of every listing, coloured by availability case.
pal <- colorFactor("Set2", domain = airbnbnyc_case$case)
# Per-listing colour vector; kept in case it is referenced elsewhere.
color_case <- pal(airbnbnyc_case$case)

# The map is built on airbnbnyc_case itself, so coordinates, colours and the
# legend all come from the same rows. Naming lng/lat explicitly avoids
# leaflet having to guess the coordinate columns.
leaflet(airbnbnyc_case) %>%
  addProviderTiles("Stamen.TonerLite") %>%
  addCircles(lng = ~longitude, lat = ~latitude, color = ~pal(case)) %>%
  addLegend(pal = pal, values = ~case, title = "Case")
## Assuming "longitude" and "latitude" are longitude and latitude, respectively
host_id) operate multiple rentals. Provide a data table of the top hosts, with the following:
# 1) calculate the number of listings each host has
# One row per host with its number of listings, largest portfolios first.
listings_by_host <- group_by(airbnbnyc, host_id)
host_listing_number <- arrange(
  summarise(listings_by_host, number = n()),
  desc(number)
)
# Convert the price column from text such as "$1,250.00" to numeric.
# Both the dollar sign and the thousands separator are stripped; removing
# only "$" leaves the comma in place, so as.numeric() would turn every price
# of 1,000 or more into NA.
airbnbnyc$price <- as.numeric(gsub("[$,]", "", airbnbnyc$price))
## Warning: NAs introduced by coercion
# 2) average nightly price per host (rounded to cents), highest first
prices_by_host <- group_by(airbnbnyc, host_id)
host_nightly_avg <- prices_by_host %>%
  summarise(nightly_avg = round(mean(price), digits = 2)) %>%
  arrange(desc(nightly_avg))
# Join the two per-host summaries (listing count and average nightly price)
# back onto the listings, then reduce to one row per host.
host_info <- airbnbnyc %>%
  left_join(host_listing_number, by = "host_id") %>%
  left_join(host_nightly_avg, by = "host_id") %>%
  # 3) estimated average monthly income from a host's listings. Only the days
  # a listing is available for rent are counted, so each listing contributes
  # number * nightly_avg * availability_365 / 12.
  mutate(monthly_income = number * nightly_avg * availability_365 / 12) %>%
  group_by(host_id, number, nightly_avg) %>%
  # Average the per-listing estimates within a host and round once, to two
  # decimals, at the end.
  summarise(
    monthly_income = round(mean(monthly_income), 2),
    .groups = "drop"
  ) %>%
  arrange(desc(monthly_income))
# Interactive table of the per-host summary, with a filter box above every
# column. The headers rename host_info's four columns in order.
datatable(
  host_info,
  rownames = FALSE,
  colnames = c(
    "Host Id",
    "Listing Count",
    "Nightly Avg Price",
    "Monthly Avg Income"
  ),
  caption = htmltools::tags$caption("Fun Facts about the Airbnb Hosts"),
  # options=list(autoWidth = TRUE, dom = "ft", pageLength=10),
  filter = list(position = "top")
)
The map should differentiate these two groups and upon clicking on a point on the map should show some basic information (at least 3 pieces of information) in a tool tip.
# Rank every listing twice: by nightly price (1 = most expensive) and by
# review score (1 = best rated). The top 100 of each ranking are selected
# later when the map is drawn. The result is left sorted by review score.
airbnbnyc_rank <- airbnbnyc %>%
  arrange(desc(price)) %>%
  mutate(price_rank = seq_len(n())) %>%
  arrange(desc(review_scores_rating)) %>%
  mutate(review_rank = seq_len(n()))
airbnbnyc_rank %>% head()
## id host_id host_listings_count latitude longitude room_type
## 1 6969473 30247261 1 40.72039 -73.99683 Entire home/apt
## 2 12040331 64454893 2 40.73426 -73.99476 Entire home/apt
## 3 14590476 90256030 1 40.74775 -73.99167 Entire home/apt
## 4 27047594 319077 5 40.68550 -73.96112 Entire home/apt
## 5 32235802 241889662 4 40.73742 -74.00484 Private room
## 6 32327326 241889662 4 40.73697 -74.00343 Private room
## accommodates bathrooms bedrooms price neighbourhood_cleansed
## 1 8 3.0 3 999 Little Italy
## 2 4 2.5 2 999 Greenwich Village
## 3 7 2.5 2 999 Chelsea
## 4 6 1.0 0 999 Clinton Hill
## 5 2 1.0 0 999 West Village
## 6 2 1.0 0 999 West Village
## neighbourhood_group_cleansed availability_365 number_of_reviews
## 1 Manhattan 0 1
## 2 Manhattan 166 11
## 3 Manhattan 0 1
## 4 Brooklyn 363 2
## 5 Manhattan 329 1
## 6 Manhattan 332 2
## review_scores_rating
## 1 100
## 2 100
## 3 100
## 4 100
## 5 100
## 6 100
## transit
## 1 almost every subway line at walking distance of the home.
## 2 Many subways around.
## 3 2 min walk to the 1, 2, N, Q, & R Trains. 5 min walk to the B,D,F,M Only 5 blocks from The Empire State Building and Herald Square!! Walking distance to Madison Square Garden, Bryant Park, & Madison Square Park.
## 4 Conveniently located next to the R train, 12-minute walk to the F and G lines and Atlantic Terminal hub (2, 3, 4, 5 , B, D, N, Q , R and W, and Long Island Railroad)
## 5 We recommend a yellow taxi in from airports or car service (call ahead) for in and out to airports. The closest Subway Station is at Eighth Avenue and 14th Street and the blue line subways A, C, and E trains. There is on-street parking available, however, we recommend two nearby Parking Garages.
## 6
## price_rank review_rank
## 1 2 1
## 2 5 2
## 3 7 3
## 4 9 4
## 5 12 5
## 6 14 6
# Interactive map of the 100 most expensive and the 100 best-reviewed
# listings, as two toggleable overlay layers with click popups.

# The popup is a formula, so leaflet evaluates it against the data of the
# layer it is attached to. Building it from the full airbnbnyc_rank columns
# would pair each plotted point with the text of an unrelated row.
listing_popup <- ~paste(
  "AirBnb Id:", id, "<br/>",
  "Nightly Price:", price, "$", "<br/>",
  "Accommodates:", accommodates, "People"
)

leaflet(airbnbnyc_rank) %>%
  # base layers; the group names must match baseGroups in the layer control
  addTiles(group = "OpenStreetMap") %>%
  addProviderTiles(providers$Stamen.TonerLite, group = "Toner Lite") %>%
  # first overlay: most expensive listings
  addCircles(
    group = "Top 100 Most Expensive",
    data = subset(airbnbnyc_rank, price_rank < 101),
    lng = ~longitude, lat = ~latitude,
    opacity = 1.0, stroke = TRUE,
    color = "#af8dc3", weight = 1,
    popup = listing_popup
  ) %>%
  # second overlay: best-reviewed listings
  addCircles(
    group = "Top 100 Best Reviews",
    data = subset(airbnbnyc_rank, review_rank < 101),
    lng = ~longitude, lat = ~latitude,
    opacity = 1.0, stroke = TRUE,
    color = "#7fbf7b", weight = 1,
    popup = listing_popup
  ) %>%
  # layer control: pick one base map, toggle either overlay
  addLayersControl(
    baseGroups = c("OpenStreetMap", "Toner Lite"),
    overlayGroups = c("Top 100 Most Expensive", "Top 100 Best Reviews"),
    options = layersControlOptions(collapsed = TRUE)
  )
## Assuming "longitude" and "latitude" are longitude and latitude, respectively
## Assuming "longitude" and "latitude" are longitude and latitude, respectively